In [61]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.offline as pyo
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
In [2]:
plt.rcParams["figure.figsize"] = (12,6)
In [3]:
data = pd.read_csv("hotel_bookings.csv")
In [4]:
data.head(5)
Out[4]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03

5 rows × 32 columns

In [5]:
data.shape
Out[5]:
(119390, 32)

Finding the columns having null values

In [6]:
data.isna().sum()
Out[6]:
hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company                           112593
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
reservation_status_date                0
dtype: int64
In [7]:
# Getting info about the columns and their data type

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB

The children column has 4 null values, replacing them by zero

In [8]:
data["children"].fillna(0, inplace = True)
In [9]:
data["children"] = data["children"].astype(int)

Removing the rows having zero guests

In [10]:
filter = (data["adults"] == 0) & (data["children"] == 0) & (data["babies"] == 0)
In [11]:
data = data[~filter]
In [12]:
data.shape
Out[12]:
(119210, 32)
In [13]:
df = data.copy()
In [14]:
df['datetime']  = (pd.to_datetime(df['arrival_date_year'].astype(str) + '-' +
                                  df['arrival_date_month'].astype(str) + '-' +
                                  df['arrival_date_day_of_month'].astype(str)))
In [15]:
df['month-year'] = df['datetime'].dt.strftime('%Y-%m')
In [16]:
df = df.sort_values('datetime')
In [17]:
## Replacing Undefined meal type with SC

df["meal"].replace("Undefined", "SC", inplace = True)
In [18]:
df["Total Guests"] = df["adults"] + df["children"] + df["babies"]
df["Total Stay"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
In [19]:
## Splitting into resort and city hotels

ch = df[df["hotel"] == "City Hotel"]
rh = df[df["hotel"] == "Resort Hotel"]

Exploratory Data Analysis

In [20]:
rh_status = rh["reservation_status"].value_counts().plot(kind = "bar")
In [21]:
rh["is_canceled"].value_counts()
Out[21]:
0    28927
1    11120
Name: is_canceled, dtype: int64
In [22]:
rh["reservation_status"].value_counts()
Out[22]:
Check-Out    28927
Canceled     10829
No-Show        291
Name: reservation_status, dtype: int64
In [23]:
rh_checked = rh[rh["is_canceled"] == 0]
In [24]:
rh_checked["is_canceled"].value_counts()
Out[24]:
0    28927
Name: is_canceled, dtype: int64
In [25]:
rh_time = pd.DataFrame(rh_checked.groupby("month-year").size(), columns = ["count"])
In [26]:
# rh_time
In [27]:
fig = px.line(rh_time, x=rh_time.index, y="count", template = "plotly_dark", title = "Time Wise Count of guests checking in at Resort Hotel")
In [28]:
fig.show(template = "plotly_dark")
In [29]:
ch_checked = ch[ch["is_canceled"] == 0]
ch_time = pd.DataFrame(ch_checked.groupby("month-year").size(), columns = ["count"])
fig = px.line(ch_time, x=ch_time.index, y="count", template = "plotly_dark", title = "Time wise count of guests that checked in at City hotel")
fig.show(template = "plotly_dark")

Country wise Analysis of guests that checked in

In [30]:
rh_country = rh_checked["country"].value_counts().to_frame()
total_rh = len(rh_checked)
rh_country["Percentage of count"] = round((rh_country["country"]/total_rh)*100,2)
In [31]:
rh_country.head(5)
Out[31]:
country Percentage of count
PRT 10184 35.21
GBR 5922 20.47
ESP 3105 10.73
IRL 1734 5.99
FRA 1399 4.84
In [32]:
fig_map = px.choropleth(rh_country, locations = rh_country.index, color = "Percentage of count", hover_data = [rh_country.index, rh_country.country], color_continuous_scale=px.colors.sequential.Plasma,
                    title="Density of origin countries of guests - Resort Hotel", template = "plotly_dark")
In [33]:
fig_map.show()

Inference : Most check-ins in the Resort Hotel are from Portugal, UK and Spain

In [34]:
ch_country = ch_checked["country"].value_counts().to_frame()
total_ch = len(ch_checked)
ch_country["Percentage count"] = round((ch_country["country"]/total_ch)*100, 2)
fig_map = px.choropleth(ch_country, locations = ch_country.index, color = "Percentage count", hover_data = [ch_country.index, ch_country.country], color_continuous_scale=px.colors.sequential.Plasma,
                    title="Density of origin countries of guests - City Hotel", template = "plotly_dark")

fig_map.show()

Inference : Most check-ins in the City Hotel are from Portugal, France and Germany

In [37]:
## Determining which countries have the most cancellations

rh_canceled = rh[rh["is_canceled"]==1]
ch_canceled = ch[ch["is_canceled"]==1]
rh_canceled_country = rh_canceled["country"].value_counts().to_frame()[:10]
rh_canceled_country["Percentage"] = round((rh_canceled_country["country"]/len(rh_canceled))*100,2)
ch_canceled_country = ch_canceled["country"].value_counts().to_frame()[:10]
ch_canceled_country["Percentage"] = round((ch_canceled_country["country"]/len(ch_canceled))*100, 2)
In [38]:
fig = px.bar(rh_canceled_country, x=rh_canceled_country.index, y='Percentage',hover_data = [rh_canceled_country.country], title = "Top 10 countries with most cancellations - Resort Hotel", template = "plotly_dark")
fig.show()
In [39]:
fig = px.bar(ch_canceled_country, x=ch_canceled_country.index, y='Percentage',hover_data = [ch_canceled_country.country], title = "Top 10 countries with most cancellations - City Hotel", template = "plotly_dark")
fig.show()

Comparing Average Daily Rate of Resort and City Hotel according to room type

In [40]:
df_checked = df[df["is_canceled"] == 0]
In [41]:
fig = px.box(df_checked, x="reserved_room_type", y="adr", color="hotel", template = "plotly_dark")
fig.show()
In [42]:
## Trend of average daily rate over time

df_adr_time = df_checked.groupby(["month-year", "hotel"])["adr"].mean().to_frame().reset_index()
df_adr_time.head(5)
Out[42]:
month-year hotel adr
0 2015-07 City Hotel 73.913545
1 2015-07 Resort Hotel 125.220860
2 2015-08 City Hotel 82.134976
3 2015-08 Resort Hotel 153.238102
4 2015-09 City Hotel 103.745096
In [43]:
fig = px.line(df_adr_time, x = "month-year", y = "adr", color = "hotel", title = "Average daily rate over time", template = "plotly_dark")
fig.show()

Determing the season where maximum guests check in

In [44]:
df_time = df_checked.groupby(["month-year", "hotel"]).size().to_frame().reset_index()
df_time = df_time.rename(columns = {0 : "count"})
In [45]:
fig = px.line(df_time, x = "month-year", y = "count", color = "hotel", title = "Average daily rate over time", template = "plotly_dark")
fig.show()
In [46]:
## Comparing price with check-ins for each hotel

df_count_price = pd.merge(df_adr_time, df_time, on = ["month-year", "hotel"])
In [47]:
rh_count_price = df_count_price[df_count_price["hotel"] == "Resort Hotel"]
ch_count_price = df_count_price[df_count_price["hotel"] == "City Hotel"]
In [48]:
ch_count_price.corr()
Out[48]:
adr count
adr 1.000000 0.804372
count 0.804372 1.000000
In [49]:
rh_count_price.corr()
Out[49]:
adr count
adr 1.000000 -0.131545
count -0.131545 1.000000

Exploring the market segment

In [50]:
rh_checked["market_segment"].value_counts().plot(kind = "pie", legend = True)
Out[50]:
<AxesSubplot:ylabel='market_segment'>
In [51]:
mkt_sgmt_cnt_rh = rh["market_segment"].value_counts().to_frame()
mkt_sgmt_cnt_ch = ch["market_segment"].value_counts().to_frame()

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=mkt_sgmt_cnt_ch.index.tolist(), values=mkt_sgmt_cnt_ch.market_segment.tolist(), name="City Hotel"),
              1, 1)
fig.add_trace(go.Pie(labels=mkt_sgmt_cnt_rh.index.tolist(), values=mkt_sgmt_cnt_rh.market_segment.tolist(), name="Resort Hotel"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Market Segment Division",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='City Hotel', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Resort Hotel', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()

Inference: Most bookings are done through online and offline travel agents and tour operators for both the hotels

Determing repeating guests

In [52]:
df_repeated = df[df["is_repeated_guest"] == 1]
In [53]:
repeated_hotel_wise = df.groupby(["hotel","is_repeated_guest"]).size().to_frame().reset_index().rename(columns = {0 : "count"})
fig = px.bar(repeated_hotel_wise, x = "hotel", y = "count", color = "is_repeated_guest", barmode = 'group', title = "Repeated Guests Hotel wise", template = "plotly_dark")
fig.show()

Inference: The number of repeated guests is almost similar in both the hotels even though the number of guests in City Hotel are higher

In [54]:
df_repeated.groupby(["hotel", "is_canceled"]).size()
Out[54]:
hotel         is_canceled
City Hotel    0              1538
              1               439
Resort Hotel  0              1667
              1               111
dtype: int64

Inference: Among repeated guests cancellations are higher for the City Hotel

Exploring customer type

In [55]:
date_df = df.set_index('datetime')
In [56]:
customer_transient = date_df[date_df["customer_type"] == "Transient"]
customer_transient_party = date_df[date_df["customer_type"] == "Transient-Party"]
customer_contract = date_df[date_df["customer_type"] == "Contract"]
customer_group = date_df[date_df["customer_type"] == "Group"]
In [57]:
customer_transient_m = customer_transient.resample("m").sum()
customer_transient_party_m = customer_transient_party.resample("m").sum()
customer_contract_m = customer_contract.resample("m").sum()
customer_group_m = customer_group.resample("m").sum()
In [59]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = customer_transient_m.index, y = customer_transient_m["Total Guests"], name = "Transient Guests"))
fig.add_trace(go.Scatter(x = customer_transient_party_m.index, y = customer_transient_party_m["Total Guests"], name = "Transient Party Guests"))
fig.add_trace(go.Scatter(x = customer_contract_m.index, y = customer_contract_m["Total Guests"], name = "Contract Guests"))
fig.add_trace(go.Scatter(x = customer_group_m.index, y = customer_group_m["Total Guests"], name = "Group Guests"))
fig.update_layout(title = "Number of Guests by Customer Type", xaxis_title = "Arrival Date", yaxis_title = "Number of Guests", width = 1050)
fig.show()

Inference: Transient Guests are much higher in number

Comparing the total duration of stay between City Hotel and Resort Hotel

In [62]:
hist_data = [rh['Total Stay'], ch["Total Stay"]]
group_labels = ["Resort Hotel", "City Hotel"]
fig = ff.create_distplot(hist_data, group_labels, show_rug = False)
fig.show()

Determing what percentage of guests got the room they reserved

In [67]:
got_reserved_rh = np.where(rh["reserved_room_type"] == rh["assigned_room_type"], "Yes", "No")
In [69]:
got_reserved_ch = np.where(ch["reserved_room_type"] == ch["assigned_room_type"], "Yes", "No")
In [74]:
got_reserved_rh_count = pd.Series(got_reserved_rh).value_counts().to_frame()
got_reserved_ch_count = pd.Series(got_reserved_ch).value_counts().to_frame()
In [77]:
got_reserved_rh_count[0]
Out[77]:
Yes    32329
No      7718
Name: 0, dtype: int64
In [78]:
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=got_reserved_ch_count.index.to_list(), values=got_reserved_ch_count[0].tolist(), name="City Hotel"),
              1, 1)
fig.add_trace(go.Pie(labels=got_reserved_rh_count.index.tolist(), values=got_reserved_rh_count[0].tolist(), name="Resort Hotel"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="If the assigned room is the same as reserved room",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='City Hotel', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Resort Hotel', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()

Inference: More number of guests at City Hotel were assigned the reserved room type when compared to the Resort Hotel

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [234]:
df_family = df_checked[(df_checked["children"]>0) | (df_checked["babies"]>0)]
In [238]:
df_family["adults"].value_counts()
Out[238]:
2    5286
1     331
3     315
0     139
4       2
Name: adults, dtype: int64
In [242]:
df_family[df_family["adults"] == 0].babies.value_counts()
Out[242]:
0    137
1      2
Name: babies, dtype: int64
In [ ]:
 
In [ ]:
 

Analysing the correlation among the data

In [16]:
sns.heatmap(data.corr(), annot = True)
Out[16]:
<AxesSubplot:>
In [ ]: